In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
%matplotlib inline


#html export
import plotly.io as pio
pio.renderers.default = 'notebook'
In [2]:
df = pd.read_excel('TRAFFIC ACCIDENTS DATA.xlsx')
df
Out[2]:
Date Accident Spot Area County Road/ Highway Brief Accident Details/Cause Victims Total people confirmed dead Time of the Accidents Weather conditions
0 2023-08-08 Sobea Sobea Nakuru Nakuru-Eldoret Highway Head on Collision Passengers 4 4.30 pm NaN
1 2023-08-07 Maai-Mahiu Naivasha Nakuru Maai-Mahiu Naivasha Highway vehicle and motorcycle collision Passengers 1 5.50 pm NaN
2 2023-07-25 Ntulele Ntulele Narok Narok Mai Mahiu road Head on Collision Drivers/Occupants 4 NaN NaN
3 2022-12-02 Suswa Suswa Narok Narok Mai Mahiu road Head on Collision Driver and passengers 3 6.00 pm NaN
4 2022-12-01 Mutira Mutira Kirinyaga Kerugoya-Karatina Road Run over Pedestrian 1 NaN NaN
... ... ... ... ... ... ... ... ... ... ...
76 2022-06-03 Losengeli Sabatia Vihiga Nairobi Kakamega road Bus lost control and rolled over Passengers 1 NaN NaN
77 2022-05-31 Isinya Isinya Kajiado Namanga Road Car rammed into a trailer Passengers 1 11.00 pm NaN
78 2022-05-30 Kinugi Kinugi Nakuru Naivasha Nairobi Highway Truck driver collided with an oncoming matatu Passengers 1 NaN NaN
79 2022-02-28 Kinugi Kinugi Nakuru Nakuru - Nairobi highway Car rammed into a lorry Passengers 4 NaN NaN
80 2023-08-08 Sobea Sobea Nakuru Nakuru Eldoret Highway matatu collided with a truck Passengers 2 4.30 am NaN

81 rows × 10 columns

Columns and Description¶

  • Date - Contains dates of the accidents
  • Accident Spot - Describes the specific location of the accident
  • Area - General area where the accident occurred.
  • County - County in which the accident happened.
  • Road/ Highway - Road or highway where the accident occurred.
  • Brief Accident Details/Cause - Short description of the accident cause or details.
  • Victims - Specifies the type of victims involved (e.g., passengers, drivers).
  • Total people confirmed dead - Indicates the number of fatalities.
  • Time of the Accidents - The time when the accidents occurred.
  • Weather conditions - Details about weather conditions during the accidents.

Preparing the dataset¶

In [3]:
df.columns
Out[3]:
Index(['Date', 'Accident Spot', 'Area', 'County', 'Road/ Highway',
       'Brief Accident Details/Cause', 'Victims',
       'Total people confirmed dead', 'Time of the Accidents',
       'Weather conditions'],
      dtype='object')
In [4]:
# make column names and values uniform
df.rename(columns={'Road/ Highway': 'Road/Highway'}, inplace=True) # renaming a column

df[ 'Total people confirmed dead'] = df[ 'Total people confirmed dead'].apply(pd.to_numeric, errors='coerce')

df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = df.dtypes[df.dtypes == 'object'].index
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

df
Out[4]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents weather_conditions
0 2023-08-08 sobea sobea nakuru nakuru-eldoret_highway head_on_collision passengers 4.0 4.30_pm NaN
1 2023-08-07 maai-mahiu naivasha nakuru maai-mahiu_naivasha_highway vehicle_and_motorcycle_collision passengers 1.0 5.50_pm NaN
2 2023-07-25 ntulele ntulele narok narok_mai_mahiu_road head_on_collision drivers/occupants 4.0 NaN NaN
3 2022-12-02 suswa suswa narok narok_mai_mahiu_road head_on_collision driver_and_passengers 3.0 6.00_pm NaN
4 2022-12-01 mutira mutira kirinyaga kerugoya-karatina_road run_over pedestrian 1.0 NaN NaN
... ... ... ... ... ... ... ... ... ... ...
76 2022-06-03 losengeli sabatia vihiga nairobi_kakamega_road bus_lost_control_and_rolled_over passengers 1.0 NaN NaN
77 2022-05-31 isinya isinya kajiado namanga_road car_rammed_into_a_trailer passengers 1.0 11.00_pm NaN
78 2022-05-30 kinugi kinugi nakuru naivasha_nairobi_highway truck_driver_collided_with_an_oncoming_matatu passengers 1.0 NaN NaN
79 2022-02-28 kinugi kinugi nakuru nakuru_-_nairobi_highway_ car_rammed_into_a_lorry passengers 4.0 NaN NaN
80 2023-08-08 sobea sobea nakuru nakuru_eldoret_highway matatu_collided_with_a_truck passengers 2.0 4.30_am NaN

81 rows × 10 columns

In [5]:
df.dtypes
Out[5]:
date                            datetime64[ns]
accident_spot                           object
area                                    object
county                                  object
road/highway                            object
brief_accident_details/cause            object
victims                                 object
total_people_confirmed_dead            float64
time_of_the_accidents                   object
weather_conditions                      object
dtype: object
In [6]:
# converting specified columns

# Convert to integer
df['total_people_confirmed_dead'] = df['total_people_confirmed_dead'].astype('Int64')

# Convert to 24-hour time format
# Step 1: Replace time strings to make them compatible
df['time_of_the_accidents'] = df['time_of_the_accidents'].replace(
    r'(\d{1,2})\.(\d{1,2})_?([ap]m)',  # Allow missing underscore
    r'\1:\2 \3',
    regex=True
)

# Step 2: Convert to datetime, handling AM/PM
df['time_of_the_accidents'] = pd.to_datetime(
    df['time_of_the_accidents'], format='%I:%M %p', errors='coerce'
)

# Step 3: Extract only the time in HH:MM format
df['time_of_the_accidents'] = df['time_of_the_accidents'].dt.strftime('%H:%M')


df
Out[6]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents weather_conditions
0 2023-08-08 sobea sobea nakuru nakuru-eldoret_highway head_on_collision passengers 4 16:30 NaN
1 2023-08-07 maai-mahiu naivasha nakuru maai-mahiu_naivasha_highway vehicle_and_motorcycle_collision passengers 1 17:50 NaN
2 2023-07-25 ntulele ntulele narok narok_mai_mahiu_road head_on_collision drivers/occupants 4 NaN NaN
3 2022-12-02 suswa suswa narok narok_mai_mahiu_road head_on_collision driver_and_passengers 3 18:00 NaN
4 2022-12-01 mutira mutira kirinyaga kerugoya-karatina_road run_over pedestrian 1 NaN NaN
... ... ... ... ... ... ... ... ... ... ...
76 2022-06-03 losengeli sabatia vihiga nairobi_kakamega_road bus_lost_control_and_rolled_over passengers 1 NaN NaN
77 2022-05-31 isinya isinya kajiado namanga_road car_rammed_into_a_trailer passengers 1 23:00 NaN
78 2022-05-30 kinugi kinugi nakuru naivasha_nairobi_highway truck_driver_collided_with_an_oncoming_matatu passengers 1 NaN NaN
79 2022-02-28 kinugi kinugi nakuru nakuru_-_nairobi_highway_ car_rammed_into_a_lorry passengers 4 NaN NaN
80 2023-08-08 sobea sobea nakuru nakuru_eldoret_highway matatu_collided_with_a_truck passengers 2 04:30 NaN

81 rows × 10 columns

In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   date                          80 non-null     datetime64[ns]
 1   accident_spot                 81 non-null     object        
 2   area                          81 non-null     object        
 3   county                        81 non-null     object        
 4   road/highway                  80 non-null     object        
 5   brief_accident_details/cause  79 non-null     object        
 6   victims                       63 non-null     object        
 7   total_people_confirmed_dead   71 non-null     Int64         
 8   time_of_the_accidents         25 non-null     object        
 9   weather_conditions            1 non-null      object        
dtypes: Int64(1), datetime64[ns](1), object(8)
memory usage: 6.5+ KB

Data Cleaning¶

In [8]:
# duplicates
df.duplicated().sum()
Out[8]:
1
In [9]:
#dropping duplicates
df = df.drop_duplicates()
In [10]:
df.duplicated().sum()
Out[10]:
0
In [11]:
# nulls
df.isna().sum()
Out[11]:
date                             1
accident_spot                    0
area                             0
county                           0
road/highway                     1
brief_accident_details/cause     2
victims                         17
total_people_confirmed_dead      9
time_of_the_accidents           55
weather_conditions              79
dtype: int64
In [12]:
(df['weather_conditions'].isna().sum()/len(df)) * 100
Out[12]:
98.75

Approximately 99% of the 'weather_conditions' column is null hence safe to drop this column

In [13]:
# dropping 'weather_conditions' column
del df['weather_conditions']
In [14]:
# date
df[df['date'].isna()]
Out[14]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
66 NaT ndarugo ndarugo kiambu thika_road_ truck_lost_control_and_overturned NaN 0 10:00
In [15]:
df.iloc[60:70]
Out[15]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
61 2023-04-01 amabuko kroka_town kisii kisii_keroka_road matatu_collided_with_a_trailer passengers 5 NaN
62 2023-03-30 kayole_bridge nakuru nakuru nakuru_-_nairobi_highway_ bus_collided_with_a_matatu passengers 18 NaN
63 2023-03-31 burguret_shopping_centre burguret muranga naromoru_nanyuki_road bus_overturned passengers 0 NaN
64 2023-04-18 maragua maragua muranga kenol_muranga_road matatu_collided_with_a_lorry passengers 2 NaN
65 2023-08-08 migori_town migori_town migori migori_isibania_road truck_failed_brakes_and_ran_into_ither_vehicles pedestrians 8 07:30
66 NaT ndarugo ndarugo kiambu thika_road_ truck_lost_control_and_overturned NaN 0 10:00
67 2023-02-04 kakwamunyen kakuma turkana lodwar_kakuma_road driver_swerved_to_avoid_on_hitting_a_camel_on_... passengers 14 22:00
68 2023-01-29 mamboleo mamboleo kisumu kakamega_kisumu_road bus_veered_off_the_road_and_landed_in_a_ditch passengers 0 NaN
69 2023-01-26 kikopey kikopey nakuru nakuru_-_nairobi_highway_ a_matatu_rammed_into_a_lorry passengers 7 NaN
70 2023-09-08 iibisil_towm iibisil kajiado namanga_road sallon_car_rammed_iinto_a_lorry passengers 3 NaN
In [16]:
# the sorrounding data for the null date is around the same period
# Use .loc to fill the 'date' column forward
df.loc[:, 'date'] = df['date'].ffill()
In [17]:
df.isna().sum()
Out[17]:
date                             0
accident_spot                    0
area                             0
county                           0
road/highway                     1
brief_accident_details/cause     2
victims                         17
total_people_confirmed_dead      9
time_of_the_accidents           55
dtype: int64
In [18]:
# road/highway
df[df['road/highway'].isna()]
Out[18]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
72 2022-07-16 koru koru kisumu NaN bus_rolled_as_it_tried_avoiding_collision_with... passengers 0 14:00
In [19]:
df[df['county'] == 'kisumu']
Out[19]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
22 2022-06-15 koru muhoroni kisumu kisumu_highway_ bus_lost_control_and_rolled passengers_and_driver <NA> NaN
24 2023-08-29 ojolla ojolla kisumu kisumu_-_busia_road head_on_collision_ passengers_and_drivers <NA> NaN
39 2023-08-28 ojola ojola kisumu kisumu_-_busia_road driver_tried_to_overtake,_rammed_into_a_traile... passenger 1 NaN
68 2023-01-29 mamboleo mamboleo kisumu kakamega_kisumu_road bus_veered_off_the_road_and_landed_in_a_ditch passengers 0 NaN
71 2022-08-30 coptic_round coptic kisumu kisumu_kakamega_road truck_lost_control_and_overturned pedestrians 3 NaN
72 2022-07-16 koru koru kisumu NaN bus_rolled_as_it_tried_avoiding_collision_with... passengers 0 14:00

There is no much supporting data for the null 'road/highway' column hence safe to drop it

In [20]:
# Remove rows where 'road/highway' is null
df = df.dropna(subset=['road/highway'])
df = df.reset_index(drop=True)
In [21]:
df.isna().sum()
Out[21]:
date                             0
accident_spot                    0
area                             0
county                           0
road/highway                     0
brief_accident_details/cause     2
victims                         17
total_people_confirmed_dead      9
time_of_the_accidents           55
dtype: int64
In [22]:
# brief_accident_details/cause
df[df['brief_accident_details/cause'].isna()]
Out[22]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
43 2023-04-02 masaba masaba kisii kisii_keroka_road NaN NaN <NA> NaN
58 2023-04-21 mau_summit kuresoi_north nakuru nakuru_eldoret_highway NaN pedestrian 1 18:30
In [23]:
df[df['road/highway'] == 'kisii_keroka_road']
Out[23]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
43 2023-04-02 masaba masaba kisii kisii_keroka_road NaN NaN <NA> NaN
60 2023-04-01 amabuko kroka_town kisii kisii_keroka_road matatu_collided_with_a_trailer passengers 5 NaN
In [24]:
# Filter rows where road/highway is "kisii_keroka_road"
kisii_keroka_road_rows = df[df['road/highway'] == 'kisii_keroka_road']

# Use loc to apply backward fill on the slice
df.loc[kisii_keroka_road_rows.index, 'brief_accident_details/cause'] = kisii_keroka_road_rows['brief_accident_details/cause'].bfill()
In [25]:
df[df['road/highway'] == 'nakuru_eldoret_highway']
Out[25]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
26 2023-07-09 ngata_bridge ngata nakuru nakuru_eldoret_highway matatu_and_truck_head_on_collision passenger 1 22:00
30 2023-05-15 sachangwan sachangwan bomet nakuru_eldoret_highway matatu_hit_a_bump,_veered_off_the_road_and_lan... NaN 5 02:00
35 2023-06-24 migaa mau_hills nakuru nakuru_eldoret_highway matatu_driver_lot_ontrol_and_hit_a_lorry driver_and_passengers 3 NaN
37 2023-09-26 ainabkoi_intersection ainabkoi uansin_gishi nakuru_eldoret_highway bus_collided_with_a_saloon_car NaN 0 NaN
48 2023-08-08 sobea sobea nakuru nakuru_eldoret_highway matatu_collided_with_a_truck passengers 2 NaN
56 2023-06-24 migaa mau_hills nakuru nakuru_eldoret_highway matatu_driver_lost_control_and_hit_a_lorry passenger 3 NaN
57 2023-05-15 sachangwan sachangwan nakuru nakuru_eldoret_highway matatu_hit_a_bump_and_veered_off_the_road passengers 5 02:00
58 2023-04-21 mau_summit kuresoi_north nakuru nakuru_eldoret_highway NaN pedestrian 1 18:30
78 2023-08-08 sobea sobea nakuru nakuru_eldoret_highway matatu_collided_with_a_truck passengers 2 04:30
In [26]:
# Filter rows where road/highway is "knakuru_eldoret_highway"
nakuru_eldoret_highway_rows = df[df['road/highway'] == 'nakuru_eldoret_highway']

# Use loc to apply forward fill on the slice as tha cause above would fit the victim
df.loc[nakuru_eldoret_highway_rows.index, 'brief_accident_details/cause'] = nakuru_eldoret_highway_rows['brief_accident_details/cause'].ffill()
In [27]:
df.isna().sum()
Out[27]:
date                             0
accident_spot                    0
area                             0
county                           0
road/highway                     0
brief_accident_details/cause     0
victims                         17
total_people_confirmed_dead      9
time_of_the_accidents           55
dtype: int64
In [28]:
# total_people_confirmed_dead
df[df['total_people_confirmed_dead'].isna()]
Out[28]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
12 2021-02-23 nithi nithi_bridge tharaka_nithi meru_embu_road head_on_collision NaN <NA> 02:00
14 2020-08-17 river_maara river_maara tharaka_nithi meru_embu_road vehicle_rolled_into_the_river NaN <NA> NaN
21 2023-09-27 sotik sotik bomet bomet_-_sotik_highway bodaboda_rider_hit_by_matatu rider <NA> NaN
22 2022-06-15 koru muhoroni kisumu kisumu_highway_ bus_lost_control_and_rolled passengers_and_driver <NA> NaN
23 2021-08-29 londiani londiani nakuru nakuru_kericho_highway vehicle_lost_control_nd_rolled NaN <NA> NaN
24 2023-08-29 ojolla ojolla kisumu kisumu_-_busia_road head_on_collision_ passengers_and_drivers <NA> NaN
25 2021-08-25 greensteads greensteads nakuru nakuru_-_nairobi_highway_ head_on_collision passengers <NA> NaN
27 2023-01-13 ratili ratili_narok_south narok narok_road saloom_car_rolled passangers_and_driver <NA> NaN
43 2023-04-02 masaba masaba kisii kisii_keroka_road matatu_collided_with_a_trailer NaN <NA> NaN

If we have no information on the victims, then we cant assign a number to the total people confirmed dead hence we drop rows where both victims and total confirmed dead are null

In [29]:
df[df['victims'].isna() & df['total_people_confirmed_dead'].isna()]
Out[29]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
12 2021-02-23 nithi nithi_bridge tharaka_nithi meru_embu_road head_on_collision NaN <NA> 02:00
14 2020-08-17 river_maara river_maara tharaka_nithi meru_embu_road vehicle_rolled_into_the_river NaN <NA> NaN
23 2021-08-29 londiani londiani nakuru nakuru_kericho_highway vehicle_lost_control_nd_rolled NaN <NA> NaN
43 2023-04-02 masaba masaba kisii kisii_keroka_road matatu_collided_with_a_trailer NaN <NA> NaN
In [30]:
# Drop rows where both 'victims' and 'total_people_confirmed_dead' are null
df = df.dropna(subset=['victims', 'total_people_confirmed_dead'], how='all')
In [31]:
df[df['total_people_confirmed_dead'].isna()]
Out[31]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
21 2023-09-27 sotik sotik bomet bomet_-_sotik_highway bodaboda_rider_hit_by_matatu rider <NA> NaN
22 2022-06-15 koru muhoroni kisumu kisumu_highway_ bus_lost_control_and_rolled passengers_and_driver <NA> NaN
24 2023-08-29 ojolla ojolla kisumu kisumu_-_busia_road head_on_collision_ passengers_and_drivers <NA> NaN
25 2021-08-25 greensteads greensteads nakuru nakuru_-_nairobi_highway_ head_on_collision passengers <NA> NaN
27 2023-01-13 ratili ratili_narok_south narok narok_road saloom_car_rolled passangers_and_driver <NA> NaN
In [32]:
df[df['victims'] == 'rider']
Out[32]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
21 2023-09-27 sotik sotik bomet bomet_-_sotik_highway bodaboda_rider_hit_by_matatu rider <NA> NaN
In [33]:
# Fill 'total_people_confirmed_dead' with 1 where 'victims' is 'rider' 
df.loc[df['victims'] == 'rider', 'total_people_confirmed_dead'] = 1
In [34]:
# central tendancies of total people confirmed dead
max = df['total_people_confirmed_dead'].max()
min = df['total_people_confirmed_dead'].min()

mean = np.mean(df['total_people_confirmed_dead'])
mode = df['total_people_confirmed_dead'].mode()[0]
median = df['total_people_confirmed_dead'].median()

print(f"Max: {max}")
print(f"Min: {min}")
print(f"Mean: {mean}")
print(f"Mode: {mode}")
print(f"Median: {median}")
Max: 52
Min: 0
Mean: 4.957746478873239
Mode: 1
Median: 3.0
In [35]:
# trying to see the distribution of dead people values
px.box(df['total_people_confirmed_dead'])

The distribution has outliers hence it would not be appropriate to replace with mean but rather the median as the nulls have both passengers and driver as the victims meaning they are more than one

In [36]:
# Replace nulls in 'total_people_confirmed_dead' with the column's median
df['total_people_confirmed_dead'] = df['total_people_confirmed_dead'].fillna(df['total_people_confirmed_dead'].median())
In [37]:
df.isna().sum()
Out[37]:
date                             0
accident_spot                    0
area                             0
county                           0
road/highway                     0
brief_accident_details/cause     0
victims                         13
total_people_confirmed_dead      0
time_of_the_accidents           52
dtype: int64
In [38]:
# victims
df['victims'].value_counts()
Out[38]:
victims
passengers                            36
driver_and_passengers                  4
passengers_and_driver                  3
passenger                              3
pedestrians                            3
pedestrian                             2
occupants_of_the_small_vehicle_        2
passengers_and_drivers                 2
drivers/occupants                      1
passengers_and_pedestrians_            1
drivers,passengers_and_pedestrians     1
rider                                  1
passangers_and_driver                  1
driver_and_other_motorists_            1
motorist                               1
Name: count, dtype: int64
In [39]:
# Dictionary to map similar names to a unified value
mapping_dict = {
    'passengers': 'passengers',
    'driver_and_passengers': 'driver_and_passengers',
    'passengers_and_driver': 'driver_and_passengers',
    'passenger': 'passengers',
    'pedestrians': 'pedestrians',
    'pedestrian': 'pedestrians',
    'occupants_of_the_small_vehicle_': 'occupants_of_the_small_vehicle_',
    'passengers_and_drivers': 'driver_and_passengers',
    'drivers/occupants': 'drivers/occupants',
    'passengers_and_pedestrians_': 'passengers_and_pedestrians',
    'drivers,passengers_and_pedestrians': 'driver_passenger_pedestrian',
    'rider': 'motorist',
    'passangers_and_driver': 'driver_and_passengers',
    'driver_and_other_motorists_': 'driver_and_motorists',
    'motorist': 'motorist'
}

# Replace values in the 'victims' column based on the mapping
df['victims'] = df['victims'].replace(mapping_dict)

# Check updated value counts
df['victims'].value_counts()
Out[39]:
victims
passengers                         39
driver_and_passengers              10
pedestrians                         5
occupants_of_the_small_vehicle_     2
motorist                            2
drivers/occupants                   1
passengers_and_pedestrians          1
driver_passenger_pedestrian         1
driver_and_motorists                1
Name: count, dtype: int64
In [40]:
df[df['victims'].isna()]
Out[40]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
28 2023-05-22 isinya isinya kajiado isinya_kiserian_road bus_lost_control_and_rolled_into_a_ditch NaN 7 NaN
30 2023-05-15 sachangwan sachangwan bomet nakuru_eldoret_highway matatu_hit_a_bump,_veered_off_the_road_and_lan... NaN 5 02:00
36 2023-09-27 laisamis laisamis isiolo marsabit_isiolo_highway vehicle_veered_off_rhe_road_after_tyre_burst NaN 2 NaN
37 2023-09-26 ainabkoi_intersection ainabkoi uansin_gishi nakuru_eldoret_highway bus_collided_with_a_saloon_car NaN 0 NaN
42 2023-04-16 josa mwatate taita_taveta mombasa_-_nairobi_highway_ head_on_collision NaN 10 NaN
44 2023-09-18 gitaru gitaru kiambu nakuru_-_nairobi_highway_ driver_lost_control_and_hit_barrier NaN 0 NaN
45 2023-09-08 malili malili makueni mombasa_-_nairobi_highway_ involved_a_truck,_lorry_and_saloon_car NaN 0 16:20
47 2023-08-30 nairagie-enkare narok narok narok_mai_mahiu_road bus_burst_into_flames NaN 0 NaN
51 2023-07-04 mau_summit kuresoi_north nakuru nakuru_kericho_highway bus_lost_control_and_landed_in_a_ditch NaN 0 NaN
52 2022-04-26 kirinyaga kirinyaga kirinyaga rukenya_kimunye_road bus_overturned NaN 0 NaN
55 2021-12-03 tuthamba kirinyaga kirinyaga sagana_kagio_road matatu_collided_head_on_with_a_saloon_car NaN 0 21:00
59 2023-04-18 naivasha naivasha nakuru nakuru_-_nairobi_highway_ matatu_collided_with_a_lorry NaN 5 NaN
65 2023-08-08 ndarugo ndarugo kiambu thika_road_ truck_lost_control_and_overturned NaN 0 10:00
In [41]:
# Get the top 2 value counts
top_2_values = df['victims'].value_counts().head(2).index

# Randomly choose from the top 2 values and fill the NaNs
df.loc[df['victims'].isna(), 'victims'] = np.random.choice(top_2_values, size=df['victims'].isna().sum())
In [42]:
df.isna().sum()
Out[42]:
date                             0
accident_spot                    0
area                             0
county                           0
road/highway                     0
brief_accident_details/cause     0
victims                          0
total_people_confirmed_dead      0
time_of_the_accidents           52
dtype: int64
In [43]:
# time
df['time_of_the_accidents'].unique()
Out[43]:
array(['16:30', '17:50', nan, '18:00', '12:30', '02:00', '16:20', '18:30',
       '21:00', '22:00', '23:30', '13:00', '19:30', '07:30', '10:00',
       '23:00', '04:30'], dtype=object)
In [44]:
mapping_dict = {
    '16:30': 'afternoon',
    '17:50': 'evening',
    '18:00': 'evening',
    '12:30': 'afternoon',
    '02:00': 'night',
    '16:20': 'afternoon',
    '18:30': 'evening',
    '21:00': 'night',
    '22:00': 'night',
    '23:30': 'night',
    '13:00': 'afternoon',
    '19:30': 'night',
    '07:30': 'morning',
    '10:00': 'morning',
    '23:00': 'night',
    '04:30': 'night'
}

# Replace values in the 'time_of_the_accidents' column based on the mapping
df['time_of_the_accidents'] = df['time_of_the_accidents'].replace(mapping_dict)

# Check updated value counts
df['time_of_the_accidents'].value_counts()
Out[44]:
time_of_the_accidents
night        12
afternoon     5
evening       4
morning       2
Name: count, dtype: int64
In [45]:
df[df['time_of_the_accidents'].isna()].head(10)
Out[45]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
2 2023-07-25 ntulele ntulele narok narok_mai_mahiu_road head_on_collision drivers/occupants 4 NaN
4 2022-12-01 mutira mutira kirinyaga kerugoya-karatina_road run_over pedestrians 1 NaN
5 2022-08-06 mlima_swara mlima_swara murang'a kenol-sagana_road car_hit_a_stationary_lorry passengers 5 NaN
6 2022-07-27 nithi nithi_bridge tharaka_nithi meru_embu_road driver_lost_control_and_swerved_off_the_bridge passengers 6 NaN
7 2022-07-28 makuyu makuyu murang'a kenol-sagana_road vehicle_rammed_into_a_lorry passengers 7 NaN
8 2022-06-28 nithi nithi_bridge tharaka_nithi meru_embu_road vehicle_rolled_into_the_bridge passengers 4 NaN
9 2021-09-21 river_tungu river_tungu tharaka_nithi meru_embu_road vehicle_rolled_into_the_river passengers 5 NaN
10 2021-09-27 nithi nithi_bridge tharaka_nithi meru_embu_road vehicle_rolled_into_the_bridge passengers 4 NaN
15 2023-09-08 mlima_kiu salama makueni mombasa_-_nairobi_highway_ matatu_collided_head_on_with_an_oncoming_truck passengers 4 NaN
16 2023-04-18 naivasha delamere_farm naivasha nairobi_-_nakuru_highway head_on_collision_matatu_and_lorry passengers 6 NaN
In [46]:
df[df['road/highway'] == 'narok_mai_mahiu_road']
Out[46]:
date accident_spot area county road/highway brief_accident_details/cause victims total_people_confirmed_dead time_of_the_accidents
2 2023-07-25 ntulele ntulele narok narok_mai_mahiu_road head_on_collision drivers/occupants 4 NaN
3 2022-12-02 suswa suswa narok narok_mai_mahiu_road head_on_collision driver_and_passengers 3 evening
47 2023-08-30 nairagie-enkare narok narok narok_mai_mahiu_road bus_burst_into_flames driver_and_passengers 0 NaN
49 2023-07-25 ntulele narok narok narok_mai_mahiu_road truck_collided_head_on_with_another_truck passengers 3 NaN
50 2022-12-02 suswa suswa narok narok_mai_mahiu_road head_on_collision driver_and_passengers 1 NaN
In [47]:
# filling time based on the most frequent time for each 'road/highway'

# Group by 'road/highway' and get the most frequent time for each group
most_frequent_time = df.groupby('road/highway')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)

# Fill null values in 'time of accident' with the most frequent time per 'road/highway'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)

# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
Out[47]:
30
In [48]:
# filling time based on the most frequent time for each 'accident_spot'

# Group by 'accident_spot' and get the most frequent time for each group
most_frequent_time = df.groupby('accident_spot')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)

# Fill null values in 'time of accident' with the most frequent time per 'accident_spot'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)

# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
Out[48]:
25
In [49]:
# filling time based on the most frequent time for each 'area'

# Group by 'area' and get the most frequent time for each group
most_frequent_time = df.groupby('area')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)

# Fill null values in 'time of accident' with the most frequent time per 'area'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)

# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
Out[49]:
24
In [50]:
# filling time based on the most frequent time for each 'county'

# Group by ''county' and get the most frequent time for each group
most_frequent_time = df.groupby('county')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)

# Fill null values in 'time of accident' with the most frequent time per 'county'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)

# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
Out[50]:
21
In [51]:
# Forward fill for the remaining null values
df.loc[:, 'time_of_the_accidents'] = df['time_of_the_accidents'].ffill()

# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
Out[51]:
0
In [52]:
df = df.reset_index(drop=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   date                          75 non-null     datetime64[ns]
 1   accident_spot                 75 non-null     object        
 2   area                          75 non-null     object        
 3   county                        75 non-null     object        
 4   road/highway                  75 non-null     object        
 5   brief_accident_details/cause  75 non-null     object        
 6   victims                       75 non-null     object        
 7   total_people_confirmed_dead   75 non-null     Int64         
 8   time_of_the_accidents         75 non-null     object        
dtypes: Int64(1), datetime64[ns](1), object(7)
memory usage: 5.5+ KB

EDA (Exploratory Data Analysis)¶

In [53]:
# trying to see the trend of 'total_people_confirmed_dead' across the years

# Create the scatter plot
fig = px.scatter(
    df,
    x='date',  # X-axis: time
    y='total_people_confirmed_dead',  # Y-axis: total confirmed dead
    title='Total People Confirmed Dead Over Time',
    labels={'date': 'Date', 'total_people_confirmed_dead': 'Total Confirmed Dead'},
    template='plotly_dark'  # Optional: Dark theme
)

# Show the plot
fig.show()
In [54]:
# trend of victims

# Group by 'victims' category and sum the total deaths for each category
victims_death_sum = df.groupby('victims')['total_people_confirmed_dead'].sum().reset_index()

# Sort the DataFrame by total_deaths in descending order
victims_death_sum = victims_death_sum.sort_values(by='total_people_confirmed_dead', ascending=False)

# Create the bar graph
fig = px.bar(
    victims_death_sum,
    x='victims',  # X-axis: categories of victims
    y='total_people_confirmed_dead',  # Y-axis: sum of total deaths
    title='Total Deaths by Victim Category',
    labels={'victims': 'Victim Category', 'total_people_confirmed_dead': 'Total Deaths'},
    text='total_people_confirmed_dead',  # Display total deaths on top of bars
    template='plotly_dark',  # Optional: dark theme
)

# Adjust the layout for better readability
fig.update_layout(
    xaxis_tickangle=-45,  # Rotate x-axis labels for better readability
    yaxis_title='Total Deaths',
    xaxis_title='Victim Category',
)

# Show the plot
fig.show()
In [55]:
# distribution of the accident spots

fig = px.histogram(
    df,
    x='accident_spot',  # X-axis: spot (location type or any other categorical data)
    title='Distribution of Accident Spots',
    labels={'spot': 'Spot'},
    template='plotly_dark'  # Optional: dark theme
)

# Adjust the layout for better readability
fig.update_layout(
    xaxis_title='Spot',
    yaxis_title='Count',
    xaxis_tickangle=-45  # Rotate x-axis labels if necessary for better readability
)

# Show the plot
fig.show()

Model Training¶